import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data1=pd.read_excel("gold2021.xlsx")
data2=pd.read_excel("gold2020.xlsx")
data3=pd.read_excel("gold2019.xlsx")
data4=pd.read_excel("gold2018.xlsx")
data=pd.concat([data1,data2,data3,data4])
data=data.sort_values(by='DATE', ascending=True)
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data.dtypes
DATE datetime64[ns] Price int64 dtype: object
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 758 entries, 38 to 0 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE 758 non-null datetime64[ns] 1 Price 758 non-null int64 dtypes: datetime64[ns](1), int64(1) memory usage: 17.8 KB
data.describe()
| Price | |
|---|---|
| count | 758.000000 |
| mean | 43096.828496 |
| std | 6789.493745 |
| min | 30289.000000 |
| 25% | 36082.500000 |
| 50% | 45162.000000 |
| 75% | 48660.000000 |
| max | 56834.000000 |
Check for normality in data¶
sns.distplot(data['Price'])
plt.axvline(x=np.mean(data['Price']), c='red', ls='--', label='mean')
plt.axvline(x=np.percentile(data['Price'],25),c='green', ls='--', label = '25th percentile:Q1')
plt.axvline(x=np.percentile(data['Price'],75),c='orange', ls='--',label = '75th percentile:Q3' )
plt.legend()
<matplotlib.legend.Legend at 0x1d985b5ae50>
plt.hist(data["Price"])
plt.show()
Find Outliers in data¶
data.boxplot(column=["Price"])
<AxesSubplot:>
Check Trend in data¶
plt.figure(figsize=(15,6))
series1=sns.lineplot(x='DATE',y='Price',data=data)
plt.show(series1)
This data have irregular trend with combination of linear & constant treand over the time¶
Check trend using plotly lib¶
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(x=data['DATE'],y=data['Price'], mode='lines'))
fig.show()
mode='lines', If we set the mode='markers', then we’ll have a regular scatter (dots) plot. There’s also another mode='lines+markers' that shows both dots and lines¶
These charts are interactive, you can hover the mouse over the chart and see the price details
Using calplot (Visualization like GITHUB)¶
import calplot
#assigning date column as its index
df=data
df=df.set_index('DATE')
#visualize time series
calplot.calplot(df['Price'],edgecolor=None)
findfont: Font family ['Helvetica'] not found. Falling back to DejaVu Sans.
(<Figure size 900x489.6 with 5 Axes>,
array([<AxesSubplot:ylabel='2018'>, <AxesSubplot:ylabel='2019'>,
<AxesSubplot:ylabel='2020'>, <AxesSubplot:ylabel='2021'>],
dtype=object))
Trend & Seasonality using Seaborn lib¶
data['Year'] = data['DATE'].dt.year
data['Month'] = data['DATE'].dt.month
sns.factorplot(data = data, x ="Month", y = "Price", hue = "Year")
<seaborn.axisgrid.FacetGrid at 0x1d986267580>
Let visualizing the sum of all sales each year. We can do that using group of “Price” and “Date” and group by “Year”.¶
df=data
df['year'] = df['DATE'].dt.year
df['month'] = df['DATE'].dt.month
df['day'] = df['DATE'].dt.day
df['weekday'] = df['DATE'].dt.weekday
df['weekday'] = np.where(df.weekday == 0, 7, df.weekday)
df_year = data[['Price','Year']].groupby(by='Year').sum().reset_index()
df_year
| Year | Price | |
|---|---|---|
| 0 | 2018 | 1226479 |
| 1 | 2019 | 8572844 |
| 2 | 2020 | 11362886 |
| 3 | 2021 | 11505187 |
sns.catplot(x='Year',y='Price',data=df_year,kind='bar',aspect=2)
<seaborn.axisgrid.FacetGrid at 0x1d98369c850>
Monthly & Yeary Gold Price (TREND & SEASONALITY)¶
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(20,7), dpi= 80)
sns.boxplot(x='Year', y='Price', data=data, ax=axes[0])
sns.boxplot(x='Month', y='Price', data=data.loc[~data.Year.isin([2018, 2021]), :])
# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=18);
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()
Inference: From the number of sales vs. year plot, we can infer an increasing trend over the years. The aggregate number of sales has increased from approximately 32000 in 2018 to slightly over 52000 in 2020. A clear trend is captured by the lineplot above¶
Weekly Gold Price¶
plot = sns.boxplot(x='weekday', y='Price', data=df)
plot.set(title='Weekly Gold Price')
[Text(0.5, 1.0, 'Weekly Gold Price')]
Time Series Decomposition Plot¶
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib
data=pd.concat([data1,data2,data3,data4])
data=data.sort_values(by='DATE', ascending=True)
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data=data.set_index('DATE')
df1=data.sort_values(by='DATE', ascending=True)
matplotlib.rc('figure', figsize=(7, 7))
decompose = seasonal_decompose(df1.Price, period=10, model='additive')
decompose.plot()
plt.show()
Determine Error, Trend and Seasonality An ETS model has three main components: error, trend, and seasonality. Each can be applied either additively, multiplicatively, or not at all. We will use the above Times Series Decomposition Plot to determine the additive or multiplicative property of the thre components.
Trend - If the trend plot is linear then we apply it additively (A). If the trend line grows or shrinks exponentially, we apply it multiplicatively (M). If there is no clear trend, no trend component is included (N).
Seasonal - If the peaks and valleys for seasonality are constant over time, we apply it additively (A). If the size of the seasonal fluctuations tends to increase or decrease with the level of time series, we apply it multiplicatively (M). If there is no seasonality, it is not applied (N).
Error - If the error plot has constant variance over time (peaks and valleys are about the same size), we apply it additively (A). If the error plot is fluctuating between large and small errors over time, we apply it multiplicatively (M).
Check Stationarity of a Time Series¶
data=pd.concat([data1,data2,data3,data4])
data=data.sort_values(by='DATE', ascending=True)
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data=data.set_index('DATE')
data
| Price | |
|---|---|
| DATE | |
| 2018-10-12 | 31876 |
| 2018-10-15 | 32264 |
| 2018-10-16 | 32109 |
| 2018-10-17 | 32043 |
| 2018-10-18 | 32037 |
| ... | ... |
| 2021-11-29 | 48042 |
| 2021-11-30 | 48085 |
| 2021-12-01 | 47633 |
| 2021-12-02 | 47402 |
| 2021-12-03 | 47475 |
758 rows × 1 columns
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determing rolling statistics
rolmean = timeseries.rolling(12).mean()
rolstd = timeseries.rolling(12).std()
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
test_stationarity(data)
Though the variation in standard deviation is small, mean is clearly increasing with time and this is not a stationary series
jupyter nbconvert --to html Forecast_EDA.ipynb